In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [2346]:
w = widgets.Dropdown(
    options=['SELECT','AAPL', 'ABUS', 'ARDS', 'BABA','BFRI', 
             'FB', 'GME', 'MCD','PFE', 'PLUG', 
             'QCOM', 'SENS','TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description ='Stock name:',

)

def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected UUUU
In [2347]:
if(w.value == 'AAPL'):
  df = pd.read_csv('/content/Final_AAPL.csv')
if(w.value == 'ABUS'):
  df = pd.read_csv('/content/Final_ABUS.csv')
if(w.value == 'ARDS'):
  df = pd.read_csv('/content/Final_ARDS.csv')
if(w.value == 'BABA'):
  df = pd.read_csv('/content/Final_BABA.csv')
if(w.value == 'BFRI'):
  df = pd.read_csv('/content/Final_BFRI.csv')
if(w.value == 'FB'):
  df = pd.read_csv('/content/Final_FB.csv')
if(w.value == 'GME'):
  df = pd.read_csv('/content/Final_GME.csv')
if(w.value == 'MCD'):
  df = pd.read_csv('/content/Final_MCD.csv')
if(w.value == 'PFE'):
  df = pd.read_csv('/content/Final_PFE.csv')
if(w.value == 'PLUG'):
  df = pd.read_csv('/content/Final_PLUG.csv')
if(w.value == 'QCOM'):
  df = pd.read_csv('/content/Final_QCOM.csv')
if(w.value == 'SENS'):
  df = pd.read_csv('/content/Final_SENS.csv')
if(w.value == 'TSLA'):
  df = pd.read_csv('/content/Final_TSLA.csv')
if(w.value == 'TWTR'):
  df = pd.read_csv('/content/Final_TWTR.csv')
if(w.value == 'UUUU'):
  df = pd.read_csv('/content/Final_UUUU.csv')
In [2348]:
pd.set_option('display.max_colwidth', None)
In [2349]:
df['Date'] = df['Date'].astype("datetime64[ns]")
In [2350]:
del df['Unnamed: 0']
In [2351]:
df.head(5)
Out[2351]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2020-04-21 1.70 1.70 1.50 1.56 1.56 1898500 -7.142858 0.427948 0.011110 0.151998 1.755096 1.333476 1.544286 NaN 9.743490 0.20 55.655072 NaN NaN NaN 0.40 NaN 0.344828 72.625062 NaN NaN 60.787990 69.295001 4.388876e+06 6.003127e+05 13034300.0 0.0 1.077428e+04 0.0 0.0 0.0 0.0 0.0 1.077428e+04 0.0 1.077428e+04 0.0 1.077428e+04 0 14 14 0 14 0 14 14
1 2020-04-22 1.60 1.93 1.57 1.90 1.90 4013300 21.794875 0.400896 0.014241 0.183142 1.867241 1.389902 1.628571 NaN 9.639027 0.37 71.556426 NaN NaN NaN 0.61 NaN 0.472868 80.056186 NaN NaN 67.986219 67.260528 7.733293e+06 1.548974e+06 17047600.0 0.0 1.045735e+06 0.0 0.0 0.0 0.0 0.0 1.045735e+06 0.0 1.045735e+06 0.0 1.045735e+06 0 69 69 0 69 0 69 69
2 2020-04-23 2.10 2.35 1.76 1.91 1.91 11813800 0.526315 0.382563 0.021641 0.241264 1.975645 1.387212 1.681429 NaN 12.631631 0.59 83.874965 NaN NaN NaN 0.60 NaN 0.458015 80.226204 NaN NaN 55.781829 61.518679 1.926510e+06 -5.136529e+04 28861400.0 0.0 1.949656e+06 0.0 0.0 0.0 0.0 0.0 1.949656e+06 0.0 1.949656e+06 0.0 1.949656e+06 0 128 128 0 128 0 128 128
3 2020-04-24 2.01 2.01 1.81 1.86 1.86 2920600 -2.617799 0.270140 0.023514 0.235369 2.026687 1.413313 1.720000 NaN 12.654262 0.20 83.874965 NaN NaN NaN 0.57 NaN 0.441861 76.705189 NaN NaN 61.203835 61.657294 4.662115e+05 -1.166021e+06 25940800.0 0.0 8.281170e+05 0.0 0.0 0.0 0.0 0.0 8.281170e+05 0.0 8.281170e+05 0.0 8.281170e+05 0 45 45 0 45 0 45 45
4 2020-04-27 1.91 1.93 1.78 1.86 1.86 1913500 0.000000 0.297518 0.019596 0.223174 2.045685 1.485743 1.765714 NaN 11.998584 0.15 76.445611 NaN NaN NaN 0.55 NaN 0.419847 76.705189 NaN NaN 44.313730 53.766465 5.937796e+05 -1.475424e+06 25940800.0 0.0 3.325070e+05 0.0 0.0 0.0 0.0 0.0 3.325070e+05 0.0 3.325070e+05 0.0 3.325070e+05 0 15 15 0 15 0 15 15
In [2352]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       423 non-null    datetime64[ns]
 1   Open                       423 non-null    float64       
 2   High                       423 non-null    float64       
 3   Low                        423 non-null    float64       
 4   Close                      423 non-null    float64       
 5   Adj Close                  423 non-null    float64       
 6   Volume                     423 non-null    int64         
 7   Return                     423 non-null    float64       
 8   Beta                       423 non-null    float64       
 9   Variance                   423 non-null    float64       
 10  AvgTrueRange               423 non-null    float64       
 11  Upperband                  423 non-null    float64       
 12  Lowerband                  423 non-null    float64       
 13  Middleband                 423 non-null    float64       
 14  APO                        418 non-null    float64       
 15  NATR                       423 non-null    float64       
 16  TRANGE                     423 non-null    float64       
 17  DMI                        423 non-null    float64       
 18  MACD                       410 non-null    float64       
 19  MACDSIGNAL                 410 non-null    float64       
 20  MACDHIST                   410 non-null    float64       
 21  MOM                        423 non-null    float64       
 22  PPO                        418 non-null    float64       
 23  ROCP                       423 non-null    float64       
 24  RSI                        423 non-null    float64       
 25  TRIX                       355 non-null    float64       
 26  ULTOSC                     415 non-null    float64       
 27  SLOWK                      423 non-null    float64       
 28  SLOWD                      423 non-null    float64       
 29  AD                         423 non-null    float64       
 30  ADOSC                      423 non-null    float64       
 31  OBV                        423 non-null    float64       
 32  Upward_momentum_created    423 non-null    float64       
 33  Downward_momentum_created  423 non-null    float64       
 34  B5_O_Um                    423 non-null    float64       
 35  B5_C_Um                    423 non-null    float64       
 36  B5_E_Um                    423 non-null    float64       
 37  B5_A_Um                    423 non-null    float64       
 38  B5_N_Um                    423 non-null    float64       
 39  B5_O_Dm                    423 non-null    float64       
 40  B5_C_Dm                    423 non-null    float64       
 41  B5_E_Dm                    423 non-null    float64       
 42  B5_A_Dm                    423 non-null    float64       
 43  B5_N_Dm                    423 non-null    float64       
 44  Verified_status_True       423 non-null    int64         
 45  Verified_status_False      423 non-null    int64         
 46  O                          423 non-null    int64         
 47  C                          423 non-null    int64         
 48  E                          423 non-null    int64         
 49  A                          423 non-null    int64         
 50  N                          423 non-null    int64         
 51  Real_or_Fake_tweet         423 non-null    int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 172.0 KB
In [2353]:
df.shape
Out[2353]:
(423, 52)
In [2354]:
sns.set(font_scale=0.8)
In [2355]:
# CHANGE CONTEXT TO poster TO INCREASE FONT SIZES
sns.set_context("talk", font_scale=1.3)

# PLOT OUT BTC-USE'S CLOSING PRICES SINCE 2014
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18,8))
    sns.lineplot(x=df.Date, y=df.Close, color='blue')
    ax.set_title('Closing Price')    
In [2356]:
# CALCULATE PRICE RETURNS AS DAILY PERCENTAGE CHANGE USING pct_change()
df['returns'] = 100 * df.Close.pct_change().dropna()
In [2357]:
# CALCULATE LOG RETURNS BASED ON ABOVE FORMULA
df['log_returns'] = np.log(df.Close/df.Close.shift(1))
In [2358]:
df.head()
Out[2358]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2020-04-21 1.70 1.70 1.50 1.56 1.56 1898500 -7.142858 0.427948 0.011110 0.151998 1.755096 1.333476 1.544286 NaN 9.743490 0.20 55.655072 NaN NaN NaN 0.40 NaN 0.344828 72.625062 NaN NaN 60.787990 69.295001 4.388876e+06 6.003127e+05 13034300.0 0.0 1.077428e+04 0.0 0.0 0.0 0.0 0.0 1.077428e+04 0.0 1.077428e+04 0.0 1.077428e+04 0 14 14 0 14 0 14 14 NaN NaN
1 2020-04-22 1.60 1.93 1.57 1.90 1.90 4013300 21.794875 0.400896 0.014241 0.183142 1.867241 1.389902 1.628571 NaN 9.639027 0.37 71.556426 NaN NaN NaN 0.61 NaN 0.472868 80.056186 NaN NaN 67.986219 67.260528 7.733293e+06 1.548974e+06 17047600.0 0.0 1.045735e+06 0.0 0.0 0.0 0.0 0.0 1.045735e+06 0.0 1.045735e+06 0.0 1.045735e+06 0 69 69 0 69 0 69 69 21.794875 0.197168
2 2020-04-23 2.10 2.35 1.76 1.91 1.91 11813800 0.526315 0.382563 0.021641 0.241264 1.975645 1.387212 1.681429 NaN 12.631631 0.59 83.874965 NaN NaN NaN 0.60 NaN 0.458015 80.226204 NaN NaN 55.781829 61.518679 1.926510e+06 -5.136529e+04 28861400.0 0.0 1.949656e+06 0.0 0.0 0.0 0.0 0.0 1.949656e+06 0.0 1.949656e+06 0.0 1.949656e+06 0 128 128 0 128 0 128 128 0.526315 0.005249
3 2020-04-24 2.01 2.01 1.81 1.86 1.86 2920600 -2.617799 0.270140 0.023514 0.235369 2.026687 1.413313 1.720000 NaN 12.654262 0.20 83.874965 NaN NaN NaN 0.57 NaN 0.441861 76.705189 NaN NaN 61.203835 61.657294 4.662115e+05 -1.166021e+06 25940800.0 0.0 8.281170e+05 0.0 0.0 0.0 0.0 0.0 8.281170e+05 0.0 8.281170e+05 0.0 8.281170e+05 0 45 45 0 45 0 45 45 -2.617799 -0.026527
4 2020-04-27 1.91 1.93 1.78 1.86 1.86 1913500 0.000000 0.297518 0.019596 0.223174 2.045685 1.485743 1.765714 NaN 11.998584 0.15 76.445611 NaN NaN NaN 0.55 NaN 0.419847 76.705189 NaN NaN 44.313730 53.766465 5.937796e+05 -1.475424e+06 25940800.0 0.0 3.325070e+05 0.0 0.0 0.0 0.0 0.0 3.325070e+05 0.0 3.325070e+05 0.0 3.325070e+05 0 15 15 0 15 0 15 15 0.000000 0.000000
In [2359]:
# DROPPING THE 1ST ROW OF DATA 
# BECAUSE I SHIFTED IT FORWARD TO CALCULATE RETURNS/LOG RETURNS
df.dropna(inplace=True)
In [2360]:
# PLOT DISTRIBUTION PLOTS OF RETURNS & LOG RETURNS
# AND VISUALLY COMPARE THEM WITH THE STANDARD NORMAL DISTRIBUTION
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

    axes[0][0].plot(df.returns, color='blue')
    axes[0][0].set_title('Returns')

    sns.distplot(df.returns, norm_hist=True, fit=stats.norm, color='blue',
                bins=50, ax=axes[0][1])
    axes[0][1].set_title('Returns')

    axes[1][0].plot(df.log_returns, color='green')
    axes[1][0].set_title('Log Returns')

    sns.distplot(df.log_returns, norm_hist=True, fit=stats.norm, color='green',
                bins=50, ax=axes[1][1])
    axes[1][1].set_title('Log Returns')
    plt.tight_layout()
    fig.show();
In [2361]:
# CREATE A FUNCTION THAT CALCULATE REALIZED VOLATILITY
# FROM SAILY LOG RETURNS
def realized_volatility_daily(series_log_return):
    """
    Get the daily realized volatility which is calculated as the square root
    of sum of squares of log returns within a specific window interval 
    """
    n = len(series_log_return)
    return np.sqrt(np.sum(series_log_return**2)/(n - 1))
In [2362]:
intervals = [7, 30, 60, 180, 365]
vols_df = {}

# ITERATE OVER intervals LIST
for i in intervals:
    # GET DAILY LOG RETURNS USING THAT INTERVAL
    vols = df.log_returns.rolling(window=i)\
                         .apply(realized_volatility_daily).values

    vols_df[i] = vols

# CONVERT vols_df FROM DICTIONARY TO PANDAS DATAFRAME
vols_df = pd.DataFrame(vols_df, columns=intervals, index=df.index)
In [2363]:
# CHANGING MATPLOTLIB STYLE
plt.style.use(['fivethirtyeight'])

fig, ax = plt.subplots(figsize=(18,7))

for i in intervals:
    if i == 7:
        alpha = 0.5
        lw = 1
    else:
        alpha = 1.0
        lw = 2
    ax.plot(vols_df[i], label=f'{i}-Day Interval Realized Volatility', 
            alpha=alpha, lw=lw)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

plt.legend(loc='best', prop={'size': 14})
plt.show();
In [2364]:
INTERVAL_WINDOW = 30
n_future = 7

# GET BACKWARD LOOKING REALIZED VOLATILITY
df['vol_current'] = df.log_returns.rolling(window=INTERVAL_WINDOW)\
                                   .apply(realized_volatility_daily)

# GET FORWARD LOOKING REALIZED VOLATILITY 
df['vol_future'] = df.log_returns.shift(-n_future)\
                                 .rolling(window=INTERVAL_WINDOW)\
                                 .apply(realized_volatility_daily)
In [2365]:
df.describe()
Out[2365]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 355.000000 355.000000 355.000000 355.000000 355.000000 3.550000e+02 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 3.550000e+02 3.550000e+02 3.550000e+02 355.0 3.550000e+02 355.0 355.0 355.0 355.0 355.0 3.550000e+02 355.0 3.550000e+02 355.0 3.550000e+02 355.000000 355.000000 355.000000 355.0 355.000000 355.0 355.000000 355.000000 355.000000 355.000000 326.000000 319.000000
mean 5.001296 5.187099 4.795099 4.994423 4.994423 3.648486e+06 0.534649 0.678715 0.097084 0.400812 5.449833 4.444074 4.946954 0.134787 7.766984 0.408873 33.718564 0.137676 0.141998 -0.004323 0.168535 3.280472 0.056186 54.304943 0.466406 50.160117 50.722076 50.817118 1.215156e+07 5.982451e+05 8.785687e+07 0.0 2.660801e+05 0.0 0.0 0.0 0.0 0.0 2.660801e+05 0.0 2.660801e+05 0.0 2.660801e+05 0.059155 44.061972 44.121127 0.0 44.121127 0.0 44.121127 44.121127 0.534649 0.003916 0.053330 0.053742
std 2.408832 2.499173 2.309760 2.407203 2.407203 2.484315e+06 5.416505 0.319741 0.143134 0.213214 2.667288 2.145850 2.392430 0.406492 1.776493 0.263777 24.586434 0.267180 0.239728 0.095513 0.888141 7.874844 0.175488 11.154106 0.550907 9.359925 24.881953 23.227869 2.342857e+07 2.444549e+06 2.708814e+07 0.0 2.960758e+05 0.0 0.0 0.0 0.0 0.0 2.960758e+05 0.0 2.960758e+05 0.0 2.960758e+05 0.269744 35.603618 35.717022 0.0 35.717022 0.0 35.717022 35.717022 5.416505 0.053035 0.013068 0.012907
min 1.450000 1.530000 1.420000 1.450000 1.450000 4.928000e+05 -13.976707 -0.153651 0.000453 0.075973 1.583366 1.396441 1.502857 -0.933782 4.627838 0.050000 0.041431 -0.462053 -0.339043 -0.275298 -2.080000 -10.533423 -0.249635 26.292084 -0.344263 25.112203 3.986488 8.049267 -2.262393e+07 -4.227492e+06 4.117650e+07 0.0 8.500000e+01 0.0 0.0 0.0 0.0 0.0 8.500000e+01 0.0 8.500000e+01 0.0 8.500000e+01 0.000000 2.000000 2.000000 0.0 2.000000 0.0 2.000000 2.000000 -13.976707 -0.150552 0.031915 0.031915
25% 2.170000 2.255000 2.050000 2.180000 2.180000 1.828000e+06 -3.097740 0.444000 0.011745 0.163435 2.265599 1.756148 2.042143 -0.059167 6.341419 0.205000 13.033242 -0.021686 -0.016840 -0.069470 -0.275000 -2.216487 -0.070849 46.481453 0.080703 43.914171 28.325922 31.185171 -1.266532e+07 -9.267465e+05 6.025675e+07 0.0 9.531788e+04 0.0 0.0 0.0 0.0 0.0 9.531788e+04 0.0 9.531788e+04 0.0 9.531788e+04 0.000000 21.000000 21.000000 0.0 21.000000 0.0 21.000000 21.000000 -3.097740 -0.031467 0.041293 0.041576
50% 5.360000 5.540000 5.160000 5.340000 5.340000 3.165900e+06 0.000000 0.667942 0.054257 0.402467 5.795863 4.864137 5.357143 0.055064 7.519420 0.390000 28.065619 0.060875 0.074059 -0.002881 0.030000 1.543300 0.006211 52.503675 0.291865 49.506390 51.515204 49.964831 1.424072e+07 -2.086426e+04 9.182500e+07 0.0 1.716309e+05 0.0 0.0 0.0 0.0 0.0 1.716309e+05 0.0 1.716309e+05 0.0 1.716309e+05 0.000000 35.000000 35.000000 0.0 35.000000 0.0 35.000000 35.000000 0.000000 0.000000 0.051194 0.051734
75% 6.495000 6.815000 6.255000 6.505000 6.505000 4.595350e+06 3.678486 0.856480 0.110290 0.587278 7.203764 5.760123 6.444286 0.376154 9.051771 0.560000 54.136604 0.321219 0.320305 0.050556 0.630000 8.105293 0.157590 60.606760 0.857693 56.989237 73.490659 72.458101 2.736544e+07 1.817347e+06 1.068549e+08 0.0 3.147136e+05 0.0 0.0 0.0 0.0 0.0 3.147136e+05 0.0 3.147136e+05 0.0 3.147136e+05 0.000000 56.000000 56.000000 0.0 56.000000 0.0 56.000000 56.000000 3.678486 0.036124 0.063612 0.063958
max 11.100000 11.390000 10.680000 10.990000 10.990000 1.877440e+07 21.283779 2.050666 1.108000 0.799723 11.758292 10.057197 10.684286 1.101987 14.908517 1.340000 94.827860 0.947579 0.807882 0.283740 3.150000 24.675267 0.656566 83.744673 1.782552 79.215986 97.875010 96.361512 6.765090e+07 8.584144e+06 1.471308e+08 0.0 2.083136e+06 0.0 0.0 0.0 0.0 0.0 2.083136e+06 0.0 2.083136e+06 0.0 2.083136e+06 2.000000 238.000000 240.000000 0.0 240.000000 0.0 240.000000 240.000000 21.283779 0.192963 0.083973 0.083973
In [2366]:
df.rename(columns = {'Real_or_Fake_tweet': 'Fake_news'}, inplace = True)
In [2367]:
df = df.fillna(df.median())
In [2368]:
df.isna().sum()
Out[2368]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [2369]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 355 entries, 68 to 422
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       355 non-null    datetime64[ns]
 1   Open                       355 non-null    float64       
 2   High                       355 non-null    float64       
 3   Low                        355 non-null    float64       
 4   Close                      355 non-null    float64       
 5   Adj Close                  355 non-null    float64       
 6   Volume                     355 non-null    int64         
 7   Return                     355 non-null    float64       
 8   Beta                       355 non-null    float64       
 9   Variance                   355 non-null    float64       
 10  AvgTrueRange               355 non-null    float64       
 11  Upperband                  355 non-null    float64       
 12  Lowerband                  355 non-null    float64       
 13  Middleband                 355 non-null    float64       
 14  APO                        355 non-null    float64       
 15  NATR                       355 non-null    float64       
 16  TRANGE                     355 non-null    float64       
 17  DMI                        355 non-null    float64       
 18  MACD                       355 non-null    float64       
 19  MACDSIGNAL                 355 non-null    float64       
 20  MACDHIST                   355 non-null    float64       
 21  MOM                        355 non-null    float64       
 22  PPO                        355 non-null    float64       
 23  ROCP                       355 non-null    float64       
 24  RSI                        355 non-null    float64       
 25  TRIX                       355 non-null    float64       
 26  ULTOSC                     355 non-null    float64       
 27  SLOWK                      355 non-null    float64       
 28  SLOWD                      355 non-null    float64       
 29  AD                         355 non-null    float64       
 30  ADOSC                      355 non-null    float64       
 31  OBV                        355 non-null    float64       
 32  Upward_momentum_created    355 non-null    float64       
 33  Downward_momentum_created  355 non-null    float64       
 34  B5_O_Um                    355 non-null    float64       
 35  B5_C_Um                    355 non-null    float64       
 36  B5_E_Um                    355 non-null    float64       
 37  B5_A_Um                    355 non-null    float64       
 38  B5_N_Um                    355 non-null    float64       
 39  B5_O_Dm                    355 non-null    float64       
 40  B5_C_Dm                    355 non-null    float64       
 41  B5_E_Dm                    355 non-null    float64       
 42  B5_A_Dm                    355 non-null    float64       
 43  B5_N_Dm                    355 non-null    float64       
 44  Verified_status_True       355 non-null    int64         
 45  Verified_status_False      355 non-null    int64         
 46  O                          355 non-null    int64         
 47  C                          355 non-null    int64         
 48  E                          355 non-null    int64         
 49  A                          355 non-null    int64         
 50  N                          355 non-null    int64         
 51  Fake_news                  355 non-null    int64         
 52  returns                    355 non-null    float64       
 53  log_returns                355 non-null    float64       
 54  vol_current                355 non-null    float64       
 55  vol_future                 355 non-null    float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 158.1 KB
In [2370]:
df.shape
Out[2370]:
(355, 56)
In [2371]:
df=df.dropna()
In [2372]:
df.dtypes
Out[2372]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [2373]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(40,15))
sns.heatmap(df.corr(),annot=True)
Out[2373]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f077448d9d0>
In [2374]:
df.hist(figsize=(20, 32), bins=70, xlabelsize=8, ylabelsize=8);
In [2375]:
df_corr = df.corr()['AvgTrueRange'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 18 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
Upperband       0.936405
High            0.929068
OBV             0.923125
Middleband      0.923111
Open            0.922536
Adj Close       0.920110
Close           0.920110
Low             0.915057
Lowerband       0.894421
AD              0.883944
TRANGE          0.833429
NATR            0.628781
Variance        0.592964
MACDSIGNAL      0.579057
vol_current     0.572316
Volume          0.536846
TRIX            0.504563
Name: AvgTrueRange, dtype: float64
In [2376]:
df_corr = df.corr()['NATR'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 8 strongly correlated values with NATR :
NATR            1.000000
vol_current     0.802117
vol_future      0.788182
TRIX            0.743856
AvgTrueRange    0.628781
OBV             0.563721
TRANGE          0.536828
MACDSIGNAL      0.521450
Name: NATR, dtype: float64
In [2377]:
df_corr = df.corr()['TRANGE'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 21 strongly correlated values with TRANGE:
TRANGE                   1.000000
AvgTrueRange             0.833429
OBV                      0.788171
High                     0.778333
Volume                   0.766161
Close                    0.760791
Adj Close                0.760791
Upperband                0.754188
Open                     0.750791
Low                      0.735336
AD                       0.731219
Middleband               0.730668
Verified_status_False    0.698532
N                        0.698285
O                        0.698285
E                        0.698285
Fake_news                0.698285
Lowerband                0.691804
Variance                 0.556942
NATR                     0.536828
MACDSIGNAL               0.507934
Name: TRANGE, dtype: float64
In [2378]:
df_corr = df.corr()['O'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with Openness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999976
Volume                       0.861133
B5_E_Dm                      0.772291
B5_O_Dm                      0.772291
B5_N_Dm                      0.772291
Downward_momentum_created    0.772291
TRANGE                       0.698285
RSI                          0.564619
Variance                     0.556972
ADOSC                        0.548940
OBV                          0.545842
ROCP                         0.535579
MOM                          0.523893
Name: O, dtype: float64
In [2379]:
df_corr = df.corr()['C'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: C, dtype: float64)
In [2380]:
df_corr = df.corr()['E'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999976
Volume                       0.861133
B5_E_Dm                      0.772291
B5_O_Dm                      0.772291
B5_N_Dm                      0.772291
Downward_momentum_created    0.772291
TRANGE                       0.698285
RSI                          0.564619
Variance                     0.556972
ADOSC                        0.548940
OBV                          0.545842
ROCP                         0.535579
MOM                          0.523893
Name: E, dtype: float64
In [2381]:
df_corr = df.corr()['A'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [2382]:
df_corr = df.corr()['N'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999976
Volume                       0.861133
B5_E_Dm                      0.772291
B5_O_Dm                      0.772291
B5_N_Dm                      0.772291
Downward_momentum_created    0.772291
TRANGE                       0.698285
RSI                          0.564619
Variance                     0.556972
ADOSC                        0.548940
OBV                          0.545842
ROCP                         0.535579
MOM                          0.523893
Name: N, dtype: float64
In [2383]:
df.columns
Out[2383]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [2384]:
df_corr = df.corr()['B5_O_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [2385]:
df_corr = df.corr()['B5_C_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [2386]:
df_corr = df.corr()['B5_E_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [2387]:
df_corr = df.corr()['B5_A_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [2388]:
df_corr = df.corr()['B5_N_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [2389]:
df_corr = df.corr()['B5_O_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.772291
N                            0.772291
E                            0.772291
O                            0.772291
Verified_status_False        0.771322
Volume                       0.655986
Name: B5_O_Dm, dtype: float64
In [2390]:
df_corr = df.corr()['B5_C_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Dm:
Series([], Name: B5_C_Dm, dtype: float64)
In [2391]:
df_corr = df.corr()['B5_E_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with B5_E_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.772291
N                            0.772291
E                            0.772291
O                            0.772291
Verified_status_False        0.771322
Volume                       0.655986
Name: B5_E_Dm, dtype: float64
In [2392]:
df_corr = df.corr()['B5_A_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [2393]:
df_corr = df.corr()['B5_N_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.772291
N                            0.772291
E                            0.772291
O                            0.772291
Verified_status_False        0.771322
Volume                       0.655986
Name: B5_N_Dm, dtype: float64
In [2394]:
df_corr = df.corr()['Fake_news'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999976
Volume                       0.861133
B5_E_Dm                      0.772291
B5_O_Dm                      0.772291
B5_N_Dm                      0.772291
Downward_momentum_created    0.772291
TRANGE                       0.698285
RSI                          0.564619
Variance                     0.556972
ADOSC                        0.548940
OBV                          0.545842
ROCP                         0.535579
MOM                          0.523893
Name: Fake_news, dtype: float64
In [2395]:
df_corr = df.corr()['Downward_momentum_created'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.772291
N                            0.772291
E                            0.772291
O                            0.772291
Verified_status_False        0.771322
Volume                       0.655986
Name: Downward_momentum_created, dtype: float64
In [2396]:
df_corr = df.corr()['Upward_momentum_created'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [2397]:
df_corr = df.corr()['Verified_status_True'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 1 strongly correlated values with Verified_status_True :
Verified_status_True    1.0
Name: Verified_status_True, dtype: float64
In [2398]:
df_corr = df.corr()['Verified_status_False'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 17 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999976
N                            0.999976
E                            0.999976
O                            0.999976
Volume                       0.861137
B5_E_Dm                      0.771322
B5_O_Dm                      0.771322
B5_N_Dm                      0.771322
Downward_momentum_created    0.771322
TRANGE                       0.698532
RSI                          0.564748
Variance                     0.557559
ADOSC                        0.549364
OBV                          0.546213
ROCP                         0.535871
MOM                          0.524073
Name: Verified_status_False, dtype: float64
In [2399]:
sns.set(font_scale=0.8)
In [2400]:
for i in range(0, len(df.columns), 5):
    sns.pairplot(data=df,
                x_vars=df.columns[i:i+5],
                y_vars=['NATR'])
In [2401]:
df.dtypes
Out[2401]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [2402]:
df.isnull().sum()
Out[2402]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [2403]:
df.fillna(0, inplace = True)
In [2404]:
df.dropna(inplace=True)
In [2405]:
sns.set(font_scale=0.8)
In [2406]:
corr = df.drop('Close', axis=1).corr() 
plt.figure(figsize=(12, 10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)], 
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
In [2407]:
df.describe()
Out[2407]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 355.000000 355.000000 355.000000 355.000000 355.000000 3.550000e+02 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000 3.550000e+02 3.550000e+02 3.550000e+02 355.0 3.550000e+02 355.0 355.0 355.0 355.0 355.0 3.550000e+02 355.0 3.550000e+02 355.0 3.550000e+02 355.000000 355.000000 355.000000 355.0 355.000000 355.0 355.000000 355.000000 355.000000 355.000000 355.000000 355.000000
mean 5.001296 5.187099 4.795099 4.994423 4.994423 3.648486e+06 0.534649 0.678715 0.097084 0.400812 5.449833 4.444074 4.946954 0.134787 7.766984 0.408873 33.718564 0.137676 0.141998 -0.004323 0.168535 3.280472 0.056186 54.304943 0.466406 50.160117 50.722076 50.817118 1.215156e+07 5.982451e+05 8.785687e+07 0.0 2.660801e+05 0.0 0.0 0.0 0.0 0.0 2.660801e+05 0.0 2.660801e+05 0.0 2.660801e+05 0.059155 44.061972 44.121127 0.0 44.121127 0.0 44.121127 44.121127 0.534649 0.003916 0.053155 0.053538
std 2.408832 2.499173 2.309760 2.407203 2.407203 2.484315e+06 5.416505 0.319741 0.143134 0.213214 2.667288 2.145850 2.392430 0.406492 1.776493 0.263777 24.586434 0.267180 0.239728 0.095513 0.888141 7.874844 0.175488 11.154106 0.550907 9.359925 24.881953 23.227869 2.342857e+07 2.444549e+06 2.708814e+07 0.0 2.960758e+05 0.0 0.0 0.0 0.0 0.0 2.960758e+05 0.0 2.960758e+05 0.0 2.960758e+05 0.269744 35.603618 35.717022 0.0 35.717022 0.0 35.717022 35.717022 5.416505 0.053035 0.012535 0.012248
min 1.450000 1.530000 1.420000 1.450000 1.450000 4.928000e+05 -13.976707 -0.153651 0.000453 0.075973 1.583366 1.396441 1.502857 -0.933782 4.627838 0.050000 0.041431 -0.462053 -0.339043 -0.275298 -2.080000 -10.533423 -0.249635 26.292084 -0.344263 25.112203 3.986488 8.049267 -2.262393e+07 -4.227492e+06 4.117650e+07 0.0 8.500000e+01 0.0 0.0 0.0 0.0 0.0 8.500000e+01 0.0 8.500000e+01 0.0 8.500000e+01 0.000000 2.000000 2.000000 0.0 2.000000 0.0 2.000000 2.000000 -13.976707 -0.150552 0.031915 0.031915
25% 2.170000 2.255000 2.050000 2.180000 2.180000 1.828000e+06 -3.097740 0.444000 0.011745 0.163435 2.265599 1.756148 2.042143 -0.059167 6.341419 0.205000 13.033242 -0.021686 -0.016840 -0.069470 -0.275000 -2.216487 -0.070849 46.481453 0.080703 43.914171 28.325922 31.185171 -1.266532e+07 -9.267465e+05 6.025675e+07 0.0 9.531788e+04 0.0 0.0 0.0 0.0 0.0 9.531788e+04 0.0 9.531788e+04 0.0 9.531788e+04 0.000000 21.000000 21.000000 0.0 21.000000 0.0 21.000000 21.000000 -3.097740 -0.031467 0.041820 0.042781
50% 5.360000 5.540000 5.160000 5.340000 5.340000 3.165900e+06 0.000000 0.667942 0.054257 0.402467 5.795863 4.864137 5.357143 0.055064 7.519420 0.390000 28.065619 0.060875 0.074059 -0.002881 0.030000 1.543300 0.006211 52.503675 0.291865 49.506390 51.515204 49.964831 1.424072e+07 -2.086426e+04 9.182500e+07 0.0 1.716309e+05 0.0 0.0 0.0 0.0 0.0 1.716309e+05 0.0 1.716309e+05 0.0 1.716309e+05 0.000000 35.000000 35.000000 0.0 35.000000 0.0 35.000000 35.000000 0.000000 0.000000 0.051194 0.051734
75% 6.495000 6.815000 6.255000 6.505000 6.505000 4.595350e+06 3.678486 0.856480 0.110290 0.587278 7.203764 5.760123 6.444286 0.376154 9.051771 0.560000 54.136604 0.321219 0.320305 0.050556 0.630000 8.105293 0.157590 60.606760 0.857693 56.989237 73.490659 72.458101 2.736544e+07 1.817347e+06 1.068549e+08 0.0 3.147136e+05 0.0 0.0 0.0 0.0 0.0 3.147136e+05 0.0 3.147136e+05 0.0 3.147136e+05 0.000000 56.000000 56.000000 0.0 56.000000 0.0 56.000000 56.000000 3.678486 0.036124 0.062248 0.062248
max 11.100000 11.390000 10.680000 10.990000 10.990000 1.877440e+07 21.283779 2.050666 1.108000 0.799723 11.758292 10.057197 10.684286 1.101987 14.908517 1.340000 94.827860 0.947579 0.807882 0.283740 3.150000 24.675267 0.656566 83.744673 1.782552 79.215986 97.875010 96.361512 6.765090e+07 8.584144e+06 1.471308e+08 0.0 2.083136e+06 0.0 0.0 0.0 0.0 0.0 2.083136e+06 0.0 2.083136e+06 0.0 2.083136e+06 2.000000 238.000000 240.000000 0.0 240.000000 0.0 240.000000 240.000000 21.283779 0.192963 0.083973 0.083973
In [2408]:
# DROPPING ALL NaN VALUES
df.dropna(inplace=True)
In [2409]:
n_zoom = 365
sns.set_context("talk", font_scale=1.3)
# plt.style.use(['seaborn'])

# VISUALIZE REALIZED CURRENT VS. FUTURE VOLATILITY
with sns.axes_style("whitegrid"):
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    ax1.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
            label='Current Volatility')
    ax1.plot(df.vol_future, lw=1, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax2.plot(df.vol_current[-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
            label='Current Volatility')
    ax2.plot(df.vol_future[-n_zoom:], lw=2, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax2.title.set_text(f'Zooming in the Last {n_zoom} Days')

    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()
    
    plt.show();

Daily Volatility Distribution

In [2410]:
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df.vol_current, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Daily Volatility Distribution')
    
    plt.show();

Experiment 2: weekly granularity

In [2412]:
w = widgets.Dropdown(
    options=['SELECT','AAPL', 'ABUS', 'ARDS', 'BABA','BFRI', 
             'FB', 'GME', 'MCD','PFE', 'PLUG', 
             'QCOM', 'SENS','TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description ='Stock name:',

)

def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected UUUU
In [2413]:
if(w.value == 'AAPL'):
  df = pd.read_csv('/content/Final_AAPL.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'ABUS'):
  df = pd.read_csv('/content/Final_ABUS.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'ARDS'):
  df = pd.read_csv('/content/Final_ARDS.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'BABA'):
  df = pd.read_csv('/content/Final_BABA.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'BFRI'):
  df = pd.read_csv('/content/Final_BFRI.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'FB'):
  df = pd.read_csv('/content/Final_FB.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'GME'):
  df = pd.read_csv('/content/Final_GME.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'MCD'):
  df = pd.read_csv('/content/Final_MCD.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'PFE'):
  df = pd.read_csv('/content/Final_PFE.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'PLUG'):
  df = pd.read_csv('/content/Final_PLUG.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'QCOM'):
  df = pd.read_csv('/content/Final_QCOM.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'SENS'):
  df = pd.read_csv('/content/Final_SENS.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'TSLA'):
  df = pd.read_csv('/content/Final_TSLA.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'TWTR'):
  df = pd.read_csv('/content/Final_TWTR.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'UUUU'):
  df = pd.read_csv('/content/Final_UUUU.csv', parse_dates=['Date'], index_col=['Date'])
In [2414]:
df.columns
Out[2414]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [2415]:
df.shape
Out[2415]:
(423, 52)
In [2416]:
df.isnull().sum()
Out[2416]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           5
NATR                          0
TRANGE                        0
DMI                           0
MACD                         13
MACDSIGNAL                   13
MACDHIST                     13
MOM                           0
PPO                           5
ROCP                          0
RSI                           0
TRIX                         68
ULTOSC                        8
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [2417]:
df = df.fillna(df.median())
del df['Unnamed: 0']
df.rename(columns = {'Real_or_Fake_tweet': 'Fake_news'}, inplace = True)
In [2418]:
df_weekly = df.resample('W').agg('mean')
In [2419]:
df_weekly.shape
Out[2419]:
(88, 51)
In [2420]:
plt.figure(figsize=(40,15))
sns.heatmap(df_weekly.corr(),annot=True)
Out[2420]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0775e56ed0>
In [2421]:
sns.set(font_scale=0.8)
In [2422]:
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
In [2423]:
df_corr = df_weekly.corr()['AvgTrueRange'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 21 strongly correlated values with AvgTrueRange:
AvgTrueRange             1.000000
Upperband                0.952269
TRANGE                   0.950027
High                     0.947696
Open                     0.944106
Middleband               0.942305
Adj Close                0.942235
Close                    0.942235
Low                      0.938861
Lowerband                0.923092
OBV                      0.915611
AD                       0.882620
Variance                 0.713967
Volume                   0.677870
Verified_status_False    0.654382
N                        0.654164
O                        0.654164
E                        0.654164
Fake_news                0.654164
MACDSIGNAL               0.558486
NATR                     0.551616
Name: AvgTrueRange, dtype: float64
In [2424]:
df_corr = df_weekly.corr()['NATR'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with NATR :
NATR            1.000000
TRIX            0.690125
Volume          0.558921
AvgTrueRange    0.551616
TRANGE          0.525254
MACDSIGNAL      0.505915
Name: NATR, dtype: float64
In [2425]:
df_corr = df_weekly.corr()['TRANGE'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 22 strongly correlated values with TRANGE:
TRANGE                   1.000000
AvgTrueRange             0.950027
High                     0.909469
Open                     0.901281
Upperband                0.901145
Close                    0.901005
Adj Close                0.901005
OBV                      0.893845
Low                      0.892724
Middleband               0.881967
Lowerband                0.851703
AD                       0.843035
Volume                   0.814197
Verified_status_False    0.799342
N                        0.799086
O                        0.799086
E                        0.799086
Fake_news                0.799086
Variance                 0.764564
MACDSIGNAL               0.577707
MACD                     0.540180
NATR                     0.525254
Name: TRANGE, dtype: float64
In [2426]:
df_corr = df_weekly.corr()['O'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 30 strongly correlated values with Openness:
Fake_news                    1.000000
E                            1.000000
O                            1.000000
N                            1.000000
Verified_status_False        0.999993
Volume                       0.908179
TRANGE                       0.799086
B5_O_Dm                      0.767979
Downward_momentum_created    0.767979
B5_E_Dm                      0.767979
B5_N_Dm                      0.767979
Variance                     0.738207
OBV                          0.678026
ADOSC                        0.671945
AvgTrueRange                 0.654164
High                         0.630479
Close                        0.624398
Adj Close                    0.624398
Open                         0.616380
Low                          0.608328
Upperband                    0.600490
MACD                         0.596325
Verified_status_True         0.593649
ROCP                         0.591636
MOM                          0.589431
ULTOSC                       0.584357
AD                           0.567815
RSI                          0.566398
Middleband                   0.560035
Lowerband                    0.505561
Name: O, dtype: float64
In [2427]:
df_corr = df_weekly.corr()['C'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: C, dtype: float64)
In [2428]:
df_corr = df_weekly.corr()['E'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 30 strongly correlated values with conscientiousness:
Fake_news                    1.000000
E                            1.000000
O                            1.000000
N                            1.000000
Verified_status_False        0.999993
Volume                       0.908179
TRANGE                       0.799086
B5_O_Dm                      0.767979
Downward_momentum_created    0.767979
B5_E_Dm                      0.767979
B5_N_Dm                      0.767979
Variance                     0.738207
OBV                          0.678026
ADOSC                        0.671945
AvgTrueRange                 0.654164
High                         0.630479
Close                        0.624398
Adj Close                    0.624398
Open                         0.616380
Low                          0.608328
Upperband                    0.600490
MACD                         0.596325
Verified_status_True         0.593649
ROCP                         0.591636
MOM                          0.589431
ULTOSC                       0.584357
AD                           0.567815
RSI                          0.566398
Middleband                   0.560035
Lowerband                    0.505561
Name: E, dtype: float64
In [2429]:
df_corr = df_weekly.corr()['A'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [2430]:
df_corr = df_weekly.corr()['N'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 30 strongly correlated values with conscientiousness:
Fake_news                    1.000000
E                            1.000000
O                            1.000000
N                            1.000000
Verified_status_False        0.999993
Volume                       0.908179
TRANGE                       0.799086
B5_O_Dm                      0.767979
Downward_momentum_created    0.767979
B5_E_Dm                      0.767979
B5_N_Dm                      0.767979
Variance                     0.738207
OBV                          0.678026
ADOSC                        0.671945
AvgTrueRange                 0.654164
High                         0.630479
Close                        0.624398
Adj Close                    0.624398
Open                         0.616380
Low                          0.608328
Upperband                    0.600490
MACD                         0.596325
Verified_status_True         0.593649
ROCP                         0.591636
MOM                          0.589431
ULTOSC                       0.584357
AD                           0.567815
RSI                          0.566398
Middleband                   0.560035
Lowerband                    0.505561
Name: N, dtype: float64
In [2431]:
df_corr = df_weekly.corr()['B5_O_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [2432]:
df_corr = df_weekly.corr()['B5_C_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [2433]:
df_corr = df_weekly.corr()['B5_E_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [2434]:
df_corr = df_weekly.corr()['B5_A_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [2435]:
df_corr = df_weekly.corr()['B5_N_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [2436]:
df_corr = df_weekly.corr()['B5_O_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.767979
N                            0.767979
E                            0.767979
O                            0.767979
Verified_status_False        0.767897
Volume                       0.719435
ROCP                         0.644368
RSI                          0.637284
MOM                          0.519692
ADOSC                        0.516177
Name: B5_O_Dm, dtype: float64
In [2437]:
df_corr = df_weekly.corr()['B5_C_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Dm:
Series([], Name: B5_C_Dm, dtype: float64)
In [2438]:
df_corr = df_weekly.corr()['B5_E_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with B5_E_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.767979
N                            0.767979
E                            0.767979
O                            0.767979
Verified_status_False        0.767897
Volume                       0.719435
ROCP                         0.644368
RSI                          0.637284
MOM                          0.519692
ADOSC                        0.516177
Name: B5_E_Dm, dtype: float64
In [2439]:
df_corr = df_weekly.corr()['B5_A_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [2440]:
df_corr = df_weekly.corr()['B5_N_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.767979
N                            0.767979
E                            0.767979
O                            0.767979
Verified_status_False        0.767897
Volume                       0.719435
ROCP                         0.644368
RSI                          0.637284
MOM                          0.519692
ADOSC                        0.516177
Name: B5_N_Dm, dtype: float64
In [2441]:
df_corr = df_weekly.corr()['Fake_news'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(len(golden_features_list), golden_features_list))
There are 30 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
E                            1.000000
O                            1.000000
N                            1.000000
Verified_status_False        0.999993
Volume                       0.908179
TRANGE                       0.799086
B5_O_Dm                      0.767979
Downward_momentum_created    0.767979
B5_E_Dm                      0.767979
B5_N_Dm                      0.767979
Variance                     0.738207
OBV                          0.678026
ADOSC                        0.671945
AvgTrueRange                 0.654164
High                         0.630479
Close                        0.624398
Adj Close                    0.624398
Open                         0.616380
Low                          0.608328
Upperband                    0.600490
MACD                         0.596325
Verified_status_True         0.593649
ROCP                         0.591636
MOM                          0.589431
ULTOSC                       0.584357
AD                           0.567815
RSI                          0.566398
Middleband                   0.560035
Lowerband                    0.505561
Name: Fake_news, dtype: float64
In [2442]:
df_corr = df_weekly.corr()['Downward_momentum_created'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.767979
N                            0.767979
E                            0.767979
O                            0.767979
Verified_status_False        0.767897
Volume                       0.719435
ROCP                         0.644368
RSI                          0.637284
MOM                          0.519692
ADOSC                        0.516177
Name: Downward_momentum_created, dtype: float64
In [2443]:
df_corr = df_weekly.corr()['Upward_momentum_created'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [2444]:
df_corr = df_weekly.corr()['Verified_status_True'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 7 strongly correlated values with Verified_status_True :
Verified_status_True     1.000000
Fake_news                0.593649
N                        0.593649
E                        0.593649
O                        0.593649
Verified_status_False    0.590710
Volume                   0.567377
Name: Verified_status_True, dtype: float64
In [2445]:
df_corr = df_weekly.corr()['Verified_status_False'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 30 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999993
E                            0.999993
O                            0.999993
N                            0.999993
Volume                       0.908045
TRANGE                       0.799342
B5_O_Dm                      0.767897
Downward_momentum_created    0.767897
B5_E_Dm                      0.767897
B5_N_Dm                      0.767897
Variance                     0.738429
OBV                          0.678310
ADOSC                        0.672192
AvgTrueRange                 0.654382
High                         0.630866
Close                        0.624787
Adj Close                    0.624787
Open                         0.616768
Low                          0.608722
Upperband                    0.600859
MACD                         0.596232
ROCP                         0.591908
Verified_status_True         0.590710
MOM                          0.589448
ULTOSC                       0.584874
AD                           0.568172
RSI                          0.566520
Middleband                   0.560416
Lowerband                    0.505954
Name: Verified_status_False, dtype: float64
In [2446]:
sns.set(font_scale=0.8)
In [2447]:
for i in range(0, len(df_weekly.columns), 5):
    sns.pairplot(data=df_weekly,
                x_vars=df_weekly.columns[i:i+5],
                y_vars=['NATR'])
In [2448]:
df_weekly.fillna(0, inplace = True)
In [2449]:
df_weekly.dropna(inplace=True)
In [2450]:
corr = df_weekly.drop('Close', axis=1).corr() 
plt.figure(figsize=(12, 10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)], 
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

Weekly volatility distribution

In [2451]:
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df_weekly.NATR, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Weekly Volatility Distribution')
    
    plt.show();